import json
import requests
import csv
import pandas as pd
import time
import seaborn as sns
import numpy as np
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from textblob import TextBlob
import nltk
from nltk import word_tokenize
nltk.download('punkt')
from nltk.probability import FreqDist
import re
#nltk.download('stopwords')
#from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import operator #module defines functions that correspond to the concept of getters.
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
matplotlib.style.use('ggplot')
import plotly
import plotly.express as px
import plotly.offline as pyo #with this I will be able to plot charts offline
import plotly.graph_objects as go #using plotly graph objects
from IPython.display import display
plotly.offline.init_notebook_mode()
import warnings
warnings.filterwarnings('ignore')
# --- Load the raw data -------------------------------------------------------
df1 = pd.read_csv('data1_file.csv')
df2 = pd.read_csv('data2_file.csv')
n_data = pd.read_csv("netflix_titles.csv")
print(df1.shape)
print(df2.shape)
print(n_data.shape)
# Stack the two uNoGS exports into a single frame.
# NOTE: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported replacement and produces the same result.
netflix_data = pd.concat([df1, df2])
# Missing country codes default to 'usa' (mapped to the full name later on).
netflix_data['country'] = netflix_data['country'].fillna('usa')
netflix_data.isnull().sum()
# Inspect the titles frame for nulls as well.
print(n_data.isnull().sum())
print(n_data.shape)
# --- Column cleanup and de-duplication ---------------------------------------
# Drop the columns we never use from the merged uNoGS frame and align its
# column names with the Netflix titles frame.
netflix_dat = netflix_data.drop(columns=['image', 'largeimage', 'download', 'imdbid'])
netflix_dat = netflix_dat.rename(columns={'synopsis': 'description', 'released': 'release_year'})
netflix_dat
# Give the titles frame the same id column name so the two frames can be joined.
n_data = n_data.rename(columns={'show_id': 'netflixid'})
n_data
print()
print(netflix_dat.shape)
print(n_data.shape)
# Drop rows that repeat the same (id, title) pair in either frame.
n_data_filter = n_data.drop_duplicates(subset=['netflixid', 'title'])
netflix_data_filter = netflix_dat.drop_duplicates(subset=['netflixid', 'title'])
print(netflix_data_filter.duplicated().any())  # confirm no full-row duplicates remain
print(n_data_filter.duplicated().any())
print(n_data_filter.shape)
print(netflix_data_filter.shape)
n_data_filter.head(2)
netflix_data_filter.head(2)
## Parse the date columns and break each out into year/month helper columns.
# Both frames get the same treatment, so drive it with one small loop.
# (The uNoGS date column needed prior cleanup before it parsed — see git
# history; row 3200 had a malformed value.)
for frame, date_col in ((n_data_filter, 'date_added'),
                        (netflix_data_filter, 'unogsdate')):
    frame[date_col] = pd.to_datetime(frame[date_col])
    frame['added_year'] = frame[date_col].dt.year
    frame['added_month'] = frame[date_col].dt.month
n_data_filter
netflix_data_filter
# --- Attach uNoGS numeric ratings to the Netflix titles ----------------------
# Pull just the id and numeric rating from the uNoGS frame.
rating_view = netflix_data_filter.loc[:, ['netflixid', 'rating']]
rating_view.rename(columns={'rating': 'rating_num'}, inplace=True)
# Inner join on netflixid: only titles present in both frames survive.
merge_data = pd.merge(n_data_filter, rating_view, on='netflixid', how='inner')
# Check for null values in the merged frame.
merge_data.rating_num.isnull().sum()
merge_data
merge_data.isnull().sum()
# Find the uNoGS ids the inner join dropped (present only in rating_view).
# A set gives O(1) membership tests instead of the O(n) list scan the original
# loop performed for every id.
merged_ids = set(merge_data.netflixid)
rating_data_list = rating_view.netflixid.to_list()
data_to_add = [i for i in rating_data_list if i not in merged_ids]
# Index by id so the missed rows can be looked up with .loc below.
netflix_data_filter_set = netflix_data_filter.set_index("netflixid")
netflix_data_filter_set
# Build a frame of just the missed rows and restore the default index.
data_add_df = netflix_data_filter_set.loc[data_to_add]
data_add_df_set = data_add_df.reset_index()
data_add_df_set.head(1)
# Rename the uNoGS columns to match merge_data, then combine the two frames so
# every netflixid is represented.
add_df_rename = data_add_df_set.rename(columns={'rating': 'rating_num',
                                                'runtime': 'duration'})
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
rating_data_overall = pd.concat([merge_data, add_df_rename], ignore_index=True)
rating_data_overall.head(2)
copy_count_rated = rating_data_overall.copy()  # work on a copy, keep the original intact
copy_count_rated.rating_num = copy_count_rated.rating_num.fillna(0)  # nulls count as 0
copy_count_rated
# Classify every title: rating >= 7.0 is 'High-rated', anything below is
# 'Low-rated'.  np.where does this in one vectorized pass and replaces the
# original per-row loop of chained-indexing writes (df[col][i] = ...), which
# pandas warns about and which may silently write to a copy.
copy_count_rated['rating_standard'] = np.where(
    copy_count_rated['rating_num'] >= 7.0, 'High-rated', 'Low-rated')
print(copy_count_rated.rating_standard.value_counts())  # totals per class
copy_count_rated.head(2)
# --- Bar chart: how many titles are high- vs low-rated -----------------------
color = ['steelblue', 'firebrick']
high_total = copy_count_rated.loc[copy_count_rated['rating_standard'] == 'High-rated'].shape[0]
low_total = copy_count_rated.loc[copy_count_rated['rating_standard'] == 'Low-rated'].shape[0]
data = [go.Bar(x=['High-rated', 'Low-rated'],
               y=[high_total, low_total],
               marker=dict(color=color))]
# Layout: chart title plus axis titles and a fixed figure size.
layout = go.Layout(title='Netflix content rating analysis',
                   xaxis=dict(title='Type of ratings'),
                   yaxis=dict(title='Total no. of ratings'),
                   height=500,
                   width=700)
fig = go.Figure(data=data, layout=layout)
fig.show()
# Work on a copy so the country edits below do not touch copy_count_rated.
copy_count_rated_country = copy_count_rated.copy()
# Expand the two-letter country codes into full country names.
copy_count_rated_country = copy_count_rated_country.replace({'country': {
    'gb': 'United Kingdom', 'ar': 'Argentina', 'hk': 'Hongkong', 'be': 'Belgium',
    'hu': 'Hungary', 'cz': 'cezhrepublic', 'de': 'Germany', 'jp': 'Japan',
    'se': 'Sweden', 'ru': 'Russia', 'au': 'Australia', 'nl': 'Netherland',
    'usa': 'United States', 'in': 'India', 'lt': 'Luthvania', 'br': 'Brazil',
    'mx': 'Mexico', 'sg': 'Singapore', 'fr': 'France', 'kr': 'South Korea',
    'it': 'Italy'}})
copy_count_rated_country.head(1)
# Keep only the columns the per-country rating analysis needs.
rating_analysis_country_filter = copy_count_rated_country.loc[:, ['netflixid', 'type', 'title', 'country', 'rating', 'duration', 'listed_in', 'description', 'rating_num', 'rating_standard']]
print(rating_analysis_country_filter.shape)
rating_analysis_country_filter.head(3)
#country_high_rating_count = x
# in country column there are more than one country in a row, so making separate rows for each country
#so creating a function and using for loop to separate the each country in different rows
def get_each_country(x):
    """Explode comma-separated country values into one row per country.

    x: DataFrame whose column at position 3 is 'country' and whose first
    column is 'netflixid'.  Rows whose country cell contains commas are
    replaced by one copy of the row per country; all other rows pass through
    unchanged.  Returns a new DataFrame with a fresh RangeIndex; x itself is
    only modified by casting 'country' to str.
    """
    x['country'] = x['country'].astype(str)  # guarantee str so "',' in value" works
    multi_ids = []      # ids of rows that had to be split
    exploded_rows = []  # one Series per (row, country) pair
    for i in range(len(x)):
        row = x.iloc[i]
        value = row.iloc[3]  # the country cell
        if ',' not in value:
            continue
        multi_ids.append(row.iloc[0])
        # The original handled only 2-4 countries with copy-pasted branches
        # and silently dropped the 5th country onward; looping over the split
        # handles any count and fixes that data loss.
        for name in value.split(','):
            new_row = row.copy()
            new_row.iloc[3] = name.strip()
            exploded_rows.append(new_row)
    exploded = pd.DataFrame(exploded_rows, columns=x.columns).reset_index(drop=True)
    # Drop every original multi-country row, then append its exploded copies.
    kept = x[~x['netflixid'].isin(multi_ids)].reset_index(drop=True)
    return pd.concat([kept, exploded], ignore_index=True)
# Split multi-country rows into one row per country.
rating_analysis_expand = get_each_country(rating_analysis_country_filter)
rating_analysis_expand.head(3)
# High-rated titles: how many does each country have?
country_high_rating = rating_analysis_expand[rating_analysis_expand.rating_standard == 'High-rated']
high_rating_by_country = country_high_rating.loc[:, ['netflixid', 'country']].rename(
    columns={'country': 'high-rating_country'})
high_rating_by_country
ccount_high = high_rating_by_country['high-rating_country'].value_counts()
# Low-rated titles: the same count per country.
country_low_rating = rating_analysis_expand[rating_analysis_expand.rating_standard == 'Low-rated']
low_rating_by_country = country_low_rating.loc[:, ['netflixid', 'country']].rename(
    columns={'country': 'low-rating_country'})
ccount_low = low_rating_by_country['low-rating_country'].value_counts()
# Put the two counts side by side: one row per country, one column per class.
rating_compare = pd.concat([ccount_high, ccount_low], axis=1)
rating_compare = rating_compare.reset_index()
rating_compare_e = rating_compare.rename(columns={'index': 'country',
                                                  'high-rating_country': 'High-rated',
                                                  'low-rating_country': 'Low-rated'})
rating_compare_e.iloc[:]
# Long format for plotting: one row per (country, rating class).
rating_compare_edit = rating_compare_e.melt(id_vars=['country'], value_vars=['High-rated', 'Low-rated'])
# Hand-picked row positions selecting the countries shown in the chart.
rating_compare_country = rating_compare_edit.iloc[[0, 53, 1, 54, 2, 55, 3, 56, 4, 57, 5, 58, 6, 59, 7, 60, 12, 65], :]
rating_compare_country
rating_compare_country
# Sunburst: rating classes nested inside each country.
fig = px.sunburst(rating_compare_country, path=['country', 'variable'], values='value', color='country',
                  title='Analysis of Netflix ratings by country', height=700, width=900)
fig.show()
# Glance at the genre values before processing them.
rating_analysis_expand.listed_in.to_list()
rating_analysis_expand.columns.values
# in our datafraem we noticed that we had many genres in each row so separating it to process and analyze it.
#so using the function to separate the values by genre and making them individual rows
def get_each_genre(x):
    """Explode comma-separated 'listed_in' (genre) values into one row per genre.

    x: DataFrame whose column at position 6 is 'listed_in' and whose first
    column is 'netflixid'.  Rows listing several genres are replaced by one
    copy per genre; single-genre rows pass through unchanged.  Returns a new
    DataFrame with a fresh RangeIndex; x itself is only modified by casting
    'listed_in' to str.
    """
    x['listed_in'] = x['listed_in'].astype(str)  # guarantee str so "',' in value" works
    multi_ids = []      # ids of rows that had to be split
    exploded_rows = []  # one Series per (row, genre) pair
    for i in range(len(x)):
        row = x.iloc[i]
        value = row.iloc[6]  # the listed_in cell
        if ',' not in value:
            continue
        multi_ids.append(row.iloc[0])
        # The original only branched on 2 or 3 genres; for 4+ it re-appended a
        # stale listOfSeries from the previous iteration (or raised NameError
        # on the first such row).  Looping over the split handles any count.
        for genre in value.split(','):
            new_row = row.copy()
            new_row.iloc[6] = genre.strip()
            exploded_rows.append(new_row)
    exploded = pd.DataFrame(exploded_rows, columns=x.columns).reset_index(drop=True)
    # Drop every original multi-genre row, then append its exploded copies.
    kept = x[~x['netflixid'].isin(multi_ids)].reset_index(drop=True)
    return pd.concat([kept, exploded], ignore_index=True)
# Explode the genre column so each row carries exactly one genre.
rating_analysis_genre = get_each_genre(rating_analysis_expand) #calling the function to process the genre columns
rating_analysis_genre
# There are too many distinct genres, so fold them into a smaller set of major
# genres with replace().  The two lists below are positionally paired: the
# i-th label in the first list becomes the i-th label in the second.
# NOTE(review): 'Romantic Moives' looks misspelled — verify it matches the raw data.
rating_analysis_genre['listed_in'] = rating_analysis_genre['listed_in'].replace(['Movie','Classic Movies',
'Kid\'s TV', 'Teens TV','Teen TV Shows','Kids\' TV',
'Stand-Up Comedy','TV Comedies','Stand-Up Comedy & Talk Shows',
'TV Dramas','Docuseries','TV Action & Adventure','Crime TV Shows',
'Children & Family Movies', 'Faith & Spirituality',
'Independent Movies','LGBTQ Movies',
'Thrillers','Horror','TV Horror','TV Thrillers',
'TV Sci-Fi & Fantasy','Romantic Moives','Romantic TV Shows',
'TV Mysteries','British TV Shows','Classic & Cult TV','Korean TV Shows','Reality TV',
'Anime Features'],
['International Movies','International Movies',
'Kid\'s and Teen','Kid\'s and Teen','Kid\'s and Teen', 'Kid\'s and Teen',
'Comedies','Comedies','Comedies',
'Dramas','Documentaries','Action & Adventure','Crime',
'Family Movies','Family Movies',
'Independent Movies and LGBTQ Movies','Independent Movies and LGBTQ Movies',
'Thrillers & Horror','Thrillers & Horror','Thrillers & Horror','Thrillers & Horror',
'Sci-Fi & Fantasy','Romantic','Romantic',
'Other Shows','Other Shows','Other Shows','Other Shows','Other Shows',
'Anime Series'])
# Normalize the lowercase 'movie' type label so the types are comparable.
rating_analysis_genre['type'] = rating_analysis_genre['type'].replace('movie','Movie')
# Treemap: country -> genre -> rating class, colored by the numeric rating.
fig = px.treemap(rating_analysis_genre, path=['country','listed_in','rating_standard'],
title ='Content analysis of each country by genre',
color='rating_num', hover_data=['rating_num'],
color_continuous_scale='RdBu')
fig.show()
# Build a duration-analysis frame from a few columns of the genre frame.
duration_analysis = rating_analysis_genre.loc[:,['netflixid','type','duration','rating_standard','rating_num']]
# duration_analysis #printing duration analysis
# Movies only: we are analyzing movie length, not episode counts.
movie_analysis = duration_analysis.loc[(duration_analysis['type'] =='Movie')]
# Keep one row per title.
movie_analysis_edited = movie_analysis.drop_duplicates(subset=['netflixid'])
# Split '90 min'-style values into the number and the unit.
# NOTE(review): values with more than one space produce extra split columns —
# presumably the data only has two tokens here; confirm against the raw file.
movie_analysis_edited[['duration_in_mins','duration_units']] = movie_analysis_edited.duration.str.split(" ",expand=True,)
# Drop rows where the split (or any other column) produced NaN.
movie_analysis_edited = movie_analysis_edited.dropna()
# Not all duration values shared one format, so a regex pass follows below.
# High-rated movies with their durations, for a separate histogram.
high_movie_analysis_edited = movie_analysis_edited[movie_analysis_edited.rating_standard == 'High-rated']
# The duration column mixes formats, so use a regex to pull out the numbers.
# For each duration string, extract every run of digits.  The first number is
# treated as an hour count when it equals 1 or 2 (values like "1 hr 30 min"),
# so it is mapped to 60/120 minutes; every later number is added as minutes.
durations = movie_analysis_edited.duration.to_list()
minutes_list = []
# The original used the name 'each' for BOTH the outer and inner loop
# variables (shadowing) and left 'sume' holding the previous row's value when
# a string contained no digits; distinct names and a per-row reset fix both.
for raw in durations:
    total = 0.0  # 0.0 when no digits are found at all
    for pos, token in enumerate(re.findall(r'\d+', raw)):
        value = float(token)
        if pos == 0:
            if value == 1:
                value = 60
            elif value == 2:
                value = 120
            total = value
        else:
            total = total + value
    minutes_list.append(total)
# New numeric column holding the parsed duration in minutes.
movie_analysis_edited['duration_in_min'] = minutes_list
# Histogram of movie durations, split by rating class, with a box marginal.
fig = px.histogram(movie_analysis_edited, x="duration_in_min", color="rating_standard",
                   nbins=18, title='Distribution of duration of movies by rating standard', marginal="box")
fig.show()
# Same distribution restricted to high-rated movies.
fig = px.histogram(high_movie_analysis_edited, x="duration_in_mins",
                   title='Distribution of duration of high-rated movies', marginal="box", nbins=18)
fig.show()
# Scatter: does movie length move with the numeric rating?
fig = px.scatter(movie_analysis_edited, x="duration_in_min", y="rating_num", color="rating_standard",
                 title='Analysis of co-relation between rating and duration',
                 hover_data=['type'])
fig.show()
# Pearson correlation between duration and rating.
from scipy.stats import pearsonr
list1 = movie_analysis_edited['duration_in_min']
list2 = movie_analysis_edited['rating_num']
corr, _ = pearsonr(list1, list2)
print('Pearsons correlation: %.3f' % corr)
correlation = list1.corr(list2)  # pandas' own estimate, for comparison
correlation
# Rating labels present in the titles frame, per country.
data_rating_class = n_data_filter.loc[:, ['netflixid', 'rating', 'country']]
data_rating_class.head(1)
# Which content-rating label does Netflix carry the most?
rating_value_counts = data_rating_class.rating.value_counts()
rating_value_counts
# Named CSS colors accepted by plotly: aliceblue, antiquewhite, aqua, aquamarine, azure, beige, bisque, black, blanchedalmond, blue, blueviolet, brown, burlywood, cadetblue, chartreuse, chocolate, coral, cornflowerblue, cornsilk, crimson, cyan, darkblue, darkcyan, darkgoldenrod, darkgray, darkgrey, darkgreen, darkkhaki, darkmagenta, darkolivegreen, darkorange, darkorchid, darkred, darksalmon, darkseagreen, darkslateblue, darkslategray, darkslategrey, darkturquoise, darkviolet, deeppink, deepskyblue, dimgray, dimgrey, dodgerblue, firebrick, floralwhite, forestgreen, fuchsia, gainsboro, ghostwhite, gold, goldenrod, gray, grey, green, greenyellow, honeydew, hotpink, indianred, indigo, ivory, khaki, lavender, lavenderblush, lawngreen, lemonchiffon, lightblue, lightcoral, lightcyan, lightgoldenrodyellow, lightgray, lightgrey, lightgreen, lightpink, lightsalmon, lightseagreen, lightskyblue, lightslategray, lightslategrey, lightsteelblue, lightyellow, lime, limegreen, linen, magenta, maroon, mediumaquamarine, mediumblue, mediumorchid, mediumpurple, mediumseagreen, mediumslateblue, mediumspringgreen, mediumturquoise, mediumvioletred, midnightblue, mintcream, mistyrose, moccasin, navajowhite, navy, oldlace, olive, olivedrab, orange, orangered, orchid, palegoldenrod, palegreen, paleturquoise, palevioletred, papayawhip, peachpuff, peru, pink, plum, powderblue, purple, red, rosybrown, royalblue, rebeccapurple, saddlebrown, salmon, sandybrown, seagreen, seashell, sienna, silver, skyblue, slateblue, slategray, slategrey, snow, springgreen, steelblue, tan, teal, thistle, tomato, turquoise, violet, wheat, white, whitesmoke, yellow, yellowgreen
#setting colors (one per rating label below)
color = ['salmon', 'firebrick', 'aqua', 'mediumorchid', 'orangered',
         'limegreen', 'gold', 'tomato', 'magenta', 'blue',
         'blueviolet', 'brown', 'burlywood', 'cadetblue',
         'chartreuse']
# Bar chart of how many titles carry each content-rating label.  The original
# repeated the same .loc[...].shape[0] filter fourteen times; a single
# value_counts() pass plus one lookup per label produces the same numbers.
rating_labels = ['TV-MA', 'TV-14', 'TV-PG', 'R', 'PG-13', 'NR', 'PG',
                 'TV-Y7', 'TV-G', 'TV-Y', 'TV-Y7-F7', 'G', 'UR', 'NC-17']
label_counts = data_rating_class['rating'].value_counts()
data = [go.Bar(x=rating_labels,
               # labels absent from the data count as 0, matching the empty filter
               y=[int(label_counts.get(label, 0)) for label in rating_labels],
               marker=dict(color=color))]
# Layout: chart title plus axis titles and a fixed figure size.
layout = go.Layout(title='Netflix content rating analysis',
                   xaxis=dict(title='Type of ratings'),
                   yaxis=dict(title='Total no. of ratings'),
                   height=500,
                   width=700)
fig = go.Figure(data=data, layout=layout)
fig.show()
rating_analysis_expand.head(2)  # peek at the frame before plotting
# Copy so the fill below leaves rating_analysis_expand untouched.
rating_analysis_by_country = rating_analysis_expand.copy()
rating_analysis_by_country = rating_analysis_by_country.fillna('N/A')  # sunburst paths cannot contain NaN
rating_analysis_by_country.isnull().sum()  # confirm nothing is null any more
# Sunburst: content-rating labels nested under each country.
fig = px.sunburst(rating_analysis_by_country, path=['country', 'rating'], color='country',
                  title='Analysis of Netflix ratings by country', height=650, width=900)
fig.show()
# Word cloud of popular words among the titles of high-rated movies/shows,
# shaped by a TV image.
country_high_rating.head(1)
# Load the mask image as a numpy array; the pixel intensities drive the mask
# (the image is already in the 0-255 form WordCloud expects).
TV_mask = np.array(Image.open("TV.jpg"))
# Join every high-rated title into one text blob for the cloud.
text = " ".join(country_high_rating['title'])
wc = WordCloud(background_color='black', max_words=500, mask=TV_mask,
               contour_width=3, contour_color='red')
wc.generate(text)
# Render it with matplotlib.
plt.figure(figsize=[20, 10])
plt.title("Wordcloud for popoular words in titles", fontsize=30)
plt.imshow(wc)
plt.axis('off')
plt.show()
# Similarly, a word cloud of directors credited on high-rated movies/shows.
rated_directorANDactor = copy_count_rated.loc[:, ['director', 'cast', 'rating_standard', 'title']]
high_rated_directorANDactor = rated_directorANDactor.loc[rated_directorANDactor.rating_standard == 'High-rated']
high_rated_director = high_rated_directorANDactor.director.dropna()  # drop titles with no director listed
# One text blob of all director names, reusing the TV-shaped mask.
text = " ".join(high_rated_director)
wc = WordCloud(background_color='black', max_words=200, mask=TV_mask,
               contour_width=3, contour_color='red')
wc.generate(text)
plt.figure(figsize=[25, 8])
plt.title("Popular directors", fontsize=25)
plt.imshow(wc)
plt.axis('off')
plt.show()
# A couple of titles credited to director Shannon Hartman.
high_rated_directorANDactor[high_rated_directorANDactor['director'].str.match('^Shannon Hartman*') == True][:2]
# I would also like to analyze whether there is any specific pattern in the descriptions of movies/shows. Can we tell which genre a movie/show belongs to just by reading its description? If so, are there any commonalities we can find within that genre?
rating_analysis_genre
# One row per netflixid with its genre, description and rating class.
rating_analysis_genre_edit = rating_analysis_genre.loc[:, ['netflixid', 'listed_in', 'description', 'rating_standard']]
analysis_genre = rating_analysis_genre_edit.drop_duplicates(subset='netflixid', keep='first')
# Index by genre so each genre's descriptions can be pulled with .loc below.
genre_pattern = analysis_genre.set_index('listed_in')
# Description sets for four popular genres.
comedies_pattern = genre_pattern.loc[['Comedies']]
Action_Adventures_pattern = genre_pattern.loc[['Action & Adventure']]
Horror_pattern = genre_pattern.loc[['Horror Movies']]
Romantic_pattern = genre_pattern.loc[['Romantic']]
# defining a function to get popular words from thd description of that specific genre
#nltk.download('stopwords')
from nltk.corpus import stopwords
def get_pattern(x):
    """Return the lower-cased, alphabetic, non-stopword tokens of x['description'].

    Joins every description into one text, tokenizes it with nltk's
    word_tokenize, keeps only purely alphabetic tokens, lower-cases them, and
    filters out English stopwords plus a couple of corpus-specific filler
    words.  Returns the surviving tokens as a list.
    """
    corpus = " ".join(x['description'])
    tokens = word_tokenize(corpus)
    stop_words = set(stopwords.words("english"))
    stop_words.update(['series', 'finds'])  # generic words that carry no genre signal
    # Keep alphabetic tokens only, lower-cased (drops punctuation and numbers).
    alpha_tokens = [tok.lower() for tok in tokens if tok.isalpha()]
    # Drop the stopwords from what is left.
    return [tok for tok in alpha_tokens if tok not in stop_words]
# Popular-word lists for the four genres selected above.
comedy_word_list = get_pattern(comedies_pattern)
action_Adv_list = get_pattern(Action_Adventures_pattern)
horror_list = get_pattern(Horror_pattern)
romantic_list = get_pattern(Romantic_pattern)
# Draw one word cloud per genre.
list_words = [horror_list, action_Adv_list, comedy_word_list, romantic_list]
title = ['horror genre', 'action and adventure genre', 'comedy genre', 'romantic genre']
for j, words in enumerate(list_words):
    # Join the genre's words into one blob and build the cloud from it.
    cloud = WordCloud(max_font_size=50, max_words=150, colormap="Oranges_r").generate(" ".join(words))
    plt.figure(figsize=(10, 8))
    plt.imshow(cloud)
    plt.title(f"Popoular words for {title[j]}", fontsize=20)
    plt.axis("off")
    plt.show()
# I have seen some people who are only tempted to watch movies with positive sentiment. For example, my aunt only watches movies that give her a positive influence: she reads a movie's description, and if it contains a word with negative vibes (like "violence"), she skips it. So let's analyze the sentiment of each title's description.
#getting specific columns from the dataframe
copy_count_rated_country
synopsis_analysis = copy_count_rated_country.loc[:,['description', 'rating_num','rating_standard','rating']]
synopsis_analysis = synopsis_analysis.set_index(['description']) #setting index
synopsis_analysis = synopsis_analysis.reset_index() #resetting index
synopsis_analysis.head(2)
# Score every description with TextBlob and bucket its polarity into a label.
value_list = []
sentiment_list = []
for row_idx in range(len(synopsis_analysis)):
    try:
        description_text = synopsis_analysis['description'][row_idx]
        polarity = TextBlob(description_text).sentiment.polarity  # [-1, 1]
        value_list.append(polarity)
        if polarity > 0:
            label = 'positive'
        elif polarity == 0:
            label = 'neutral'
        else:
            label = 'negative'
        sentiment_list.append(label)
    except:  # NOTE(review): bare except kept to preserve original behavior; failing rows are skipped entirely
        continue
# Attach the labels as a new column (every row either contributed a label
# or was skipped by the loop above — lengths must match for this to work).
synopsis_analysis['sentiment'] = sentiment_list
# Drop rows with missing values, then count titles per sentiment label.
sentiment_analysis = synopsis_analysis.dropna(axis=0)
count_sentiments = sentiment_analysis['sentiment'].value_counts()
#making a function to calculate the percent of sentiment values (positive/neutral/negative)
def get_sentiment_count(df):
    """Summarize the 'sentiment' column of *df*.

    Returns a DataFrame with one row per sentiment label and columns:
        sentiment_value -- the label ('positive' / 'neutral' / 'negative')
        counts          -- number of rows carrying that label
        percent         -- counts as a percentage of the total

    The frame is built explicitly from value_counts() rather than via
    reset_index()/rename(): the old reset_index-based code produced
    wrongly named columns on pandas >= 2.0, where the reset column is
    called 'count' instead of the series name.
    """
    counts = df['sentiment'].value_counts()
    y_count = pd.DataFrame({'sentiment_value': counts.index, 'counts': counts.values})
    total = y_count['counts'].sum()
    # Vectorized division instead of apply(lambda ...).
    y_count['percent'] = y_count['counts'] / total * 100
    return y_count
# Donut chart of the sentiment share across the analyzed titles.
high_count = get_sentiment_count(sentiment_analysis)
pie_trace = go.Pie(labels=high_count['sentiment_value'], values=high_count['percent'], hole=.3)
fig = go.Figure(data=[pie_trace])
fig.update_layout(
    title_text="Synopsis sentiment of high-rated movies",
    # Annotation placed in the center of the donut.
    annotations=[dict(text='sentiments', x=0.50, y=0.5, font_size=14, showarrow=False)])
fig.show()
# Good news for viewers (like my aunt) who prefer positive-sentiment titles.
# Split by rating standard, then by sentiment, and fetch the popular words
# for the positive and negative high-rated subsets.
high_rated = sentiment_analysis.loc[sentiment_analysis['rating_standard'] == 'High-rated']
low_rated = sentiment_analysis.loc[sentiment_analysis['rating_standard'] == 'Low-rated']
only_high_pos = high_rated.loc[high_rated['sentiment'] == 'positive']
only_high_neg = high_rated.loc[high_rated['sentiment'] == 'negative']
high_pos_words = get_pattern(only_high_pos)  # reuses the earlier helper
high_neg_words = get_pattern(only_high_neg)
# Word cloud of terms that signal a high-rated, positive-sentiment title.
text = " ".join(high_pos_words)
stopwords = set(STOPWORDS)
stopwords.update(["series", 'show'])  # too generic to carry signal here
# generate() returns the WordCloud itself, so construction and generation chain.
wc = WordCloud(stopwords=stopwords, background_color='white', max_words=1000).generate(text)
plt.figure(figsize=[15, 7])
plt.title("Positive words to find high-rated movies with positive-sentiment", fontsize=30)
plt.imshow(wc)
plt.axis('off')
plt.show()
CountVectorizer transforms a given text into a vector based on the frequency (count) of each word that occurs in the entire text, and `cosine_similarity` is used to measure similarity. Cosine similarity computes the cosine of the angle between two vectors; it does not measure the straight-line distance between two points.
# Columns that feed the content-based recommender; work on a copy with
# blanks instead of NaN so string operations never see missing values.
new_df_recommend = n_data_filter.loc[:, ['type', 'title', 'director', 'cast', 'listed_in']]
new_df = new_df_recommend.copy().fillna('')
#defining a function to clean data according to the need
def clean_data(x):
    """Lower-case *x* and remove all spaces; non-strings become ''."""
    if not isinstance(x, str):
        return ''
    return x.replace(" ", "").lower()
# Normalize every feature column with clean_data().
columns = ['type', 'title', 'director', 'cast', 'listed_in']
for feature_col in columns:
    new_df[feature_col] = new_df[feature_col].map(clean_data)
#defining function that combines all the feature columns together
def combine_cols(x):
    """Concatenate the five feature fields of row *x* into one space-separated string."""
    parts = (x['type'], x['title'], x['director'], x['cast'], x['listed_in'])
    return ' '.join(parts)
# Build the bag-of-words column the vectorizer will consume (row-wise apply).
new_df['combined_cols'] = new_df.apply(combine_cols, axis=1)
#making another recommend function using countvectorizer
def recommend_func(x, user_title, new_df_recommend):
    """Print the 5 titles most similar to *user_title*.

    Similarity is cosine similarity over a CountVectorizer bag-of-words
    built from x['combined_cols']; the matching titles are printed from
    new_df_recommend by row position.
    """
    vectorizer = CountVectorizer(stop_words='english')
    # Term-count matrix, one row per title.
    count_matrix = vectorizer.fit_transform(x['combined_cols'])
    from sklearn.metrics.pairwise import cosine_similarity
    similarity_metric = cosine_similarity(count_matrix, count_matrix)
    # Map normalized title -> row position (index reset so positions line
    # up with the similarity-matrix rows).
    try:
        x = x.reset_index()
        index_col = pd.Series(x.index, index=x['title'])
    except:
        index_col = pd.Series(x.index, index=x['title'])
    # Titles were de-spaced and lower-cased earlier, so normalize the query too.
    lookup_title = user_title.replace(' ', '').lower()
    index_val = index_col[lookup_title]
    print("The 5 shows related to your title are:\n")
    # Pair each row position with its score, best matches first.
    scored = sorted(enumerate(similarity_metric[index_val]),
                    key=lambda pair: pair[1], reverse=True)
    # Skip slot 0 (the queried title itself) and keep the next five.
    movie_index_val = [pos for pos, _ in scored[1:6]]
    for recommended in new_df_recommend['title'].iloc[movie_index_val]:
        print(recommended)
#defining another function to get the title that the user chose
def get_df_title_genre(new_df, user_genre, user_title):
    """Restrict *new_df* to one (normalized) genre, then recommend by title."""
    # Features were lower-cased and de-spaced during cleaning, so the genre
    # string must be normalized the same way before comparing.
    genre_key = user_genre.replace(' ', '').lower()
    genre_frame = new_df[new_df.listed_in == genre_key]
    recommend_func(genre_frame, user_title, new_df_recommend)
def get_df_title(new_df, user_title):
    """Recommend by title across the whole catalogue (no genre filter)."""
    recommend_func(new_df.copy(), user_title, new_df_recommend)
def choose_genre(user_genre):
    """Echo the chosen genre and show three sample titles from it."""
    print(user_genre)
    matching = new_df_recommend[new_df_recommend['listed_in'] == user_genre]
    sample_titles = matching['title'].to_list()
    print(sample_titles[0:3])
#defining the menu function
def menu():
    """Interactive recommendation menu.

    Loops until the user enters 'N'. 'T' recommends by title across the
    whole catalogue; 'G' first narrows to one genre, then recommends by
    title within it. Invalid genre choices re-enter the menu recursively.
    """
    while True:
        # Top-level choice: by title, by genre+title, or quit.
        inp = input("How would you like to get the recommendations? \n"
                    " Enter 'T' if you would like to get recommendation just by title:\n"
                    " Enter 'G' if you would like to get recommendation with genre and title:\n"
                    " Enter 'N' for no recommendations and to quit:\n").upper()
        if inp == 'T':  # recommendation by title only
            user_input=input("Enter the title name to get some recommendation: ")
            print(f"Your title is:{user_input} ")  # echo the user's title
            print()  # one line of spacing
            print("The 5 shows related to your title are: \n")
            get_df_title(new_df,user_input)  # recommend across the whole catalogue
        elif inp =='G':  # recommendation by genre, then title
            print("Genre options:\n"
                  " (Enter) 'A'for 'Documentaries'\n"
                  " (Enter) 'B' for 'Stand-Up Comedy'\n"
                  " (Enter) 'C' for 'Dramas, Independent Movies, International Movies'\n"
                  ' (Enter) "D" for "Kids\' TV "\n'
                  " (Enter) 'E' for 'Dramas, International Movies, Romantic Movies'\n"
                  " (Enter) 'F' for 'Action & Adventure, Sci-Fi & Fantasy'\n"
                  " (Enter) 'G' for 'Horror Movies, Thrillers'\n")
            user_genre = input("Enter the letter for the genre you would like:\n ").upper()
            list_options = ['A','B','C','D','E','F','G']  # valid menu letters
            # Genre names, positionally matched to list_options.
            genre_options = ['Documentaries','Stand-Up Comedy','Dramas, Independent Movies, International Movies', "Kids\' TV",
                             'Dramas, International Movies, Romantic Movies','Action & Adventure, Sci-Fi & Fantasy','Horror Movies, Thrillers']
            if user_genre in list_options:
                i = list_options.index(user_genre)  # letter -> position
                genre_name = genre_options[i]
                print(f"You selected {genre_name} genre.\n")
                choose_genre(user_genre = genre_name)  # show sample titles for the genre
            else:
                print()
                print("You should select the above genre options. Please select genre from above options")
                menu()  # invalid letter: re-enter the menu recursively
            # NOTE(review): after the recursive menu() above returns, execution
            # falls through to here with genre_name possibly undefined; the
            # except block below masks the resulting NameError.
            user_title =input("Enter the title name to get some recommendation like that: ")
            print()
            try:
                get_df_title_genre(new_df,genre_name, user_title)
            except Exception:
                print("The title could not be found. Please enter other title name.")
                menu()  # retry from the top on failure
        elif inp == 'N':  # quit
            return "Thank you!!!!!"
            break  # NOTE(review): unreachable — the return above exits first
        else:
            print("Please select the correct options.")
            continue  # invalid top-level input: ask again
menu()  # start the interactive session
#n_data_filter.head(1) #taking a glance of the dataframe
# Features for the KNN rating predictor; rows with missing values are unusable.
data_knn = n_data_filter.loc[:, ['netflixid', 'title', 'director', 'cast', 'rating']].dropna()
Now we want to predict the rating based on the cast, director, and rating category (TV-MA, TV-14) of the movie/show. This technique is called supervised machine learning because we know in advance what we want to predict. All the features we are going to choose are strings, so we have to convert them to binary form for classification.
# Build the vocabulary used to binary-encode the 'cast' column.
# NOTE(review): each 'cast' value is a single comma-separated string, so this
# inner loop iterates its CHARACTERS, not cast names — kept as-is to preserve
# the original encoding behavior.
castList = []
for _, row in data_knn.iterrows():
    for token in row["cast"]:
        # Preserve first-seen order and skip duplicates.
        if token not in castList:
            castList.append(token)
#print(castList)
# once we have each vocabulary item we have to convert it to 1's or 0's
def convert(x_list, data_list):
    """Binary-encode *data_list* against *x_list*.

    Returns a list with one 0/1 entry per item of data_list: 1 when the item
    occurs in x_list, else 0. Note that `in` here is whatever containment
    x_list supports — a substring test when x_list is a string (how the cast,
    director and rating columns use it), a membership test when it is a list.
    A comprehension replaces the original manual append loop; a set lookup is
    deliberately NOT used because it would break the substring semantics.
    """
    return [1 if item in x_list else 0 for item in data_list]
# Binary-encode the cast column against the per-character vocabulary above.
data_knn['cast_binary'] = data_knn['cast'].map(lambda cast_str: convert(cast_str, castList))
data_knn['cast_binary'].head(2)
# Vocabulary of director strings, unique and in first-seen order.
directorList = []
for name in data_knn['director']:
    if name not in directorList:
        directorList.append(name)
# 1/0 flags: each directorList entry is tested for occurrence in the row's director string.
data_knn['director_binary'] = data_knn['director'].map(lambda d: convert(d, directorList))
data_knn.head(2)
# Same encoding for the rating category (TV-MA, TV-14, ...) used as a predictor.
# NOTE(review): as with cast, the inner loop iterates the CHARACTERS of each
# rating string — preserved to keep the original encoding.
rating_list = []
for _, row in data_knn.iterrows():
    for token in row["rating"]:
        if token not in rating_list:
            rating_list.append(token)
# Binary-encode the rating column against that vocabulary.
data_knn['rating_binary'] = data_knn['rating'].map(lambda r: convert(r, rating_list))
data_knn.head(2)
`scipy.spatial` is a Python module used to find the distance between two points. Its `cosine` distance function computes a distance based on cosine similarity, which measures the angle between two vectors: if two points lie along the same (or a nearby) direction, the angle between them is small and their similarity is high, and vice versa. This concept is applied in the recommendation system to find the similarity between features (cast, director, and so on).
from scipy import spatial
#defining the function that finds the similarity between two titles
def Similarity(firstid, secondid):
    """Total cosine distance between rows *firstid* and *secondid* of data_knn
    over the director/cast/rating binary vectors (smaller = more similar)."""
    row_a = data_knn.iloc[firstid]
    row_b = data_knn.iloc[secondid]
    total = 0.0
    # Accumulate in the original order: director, cast, rating.
    for feature in ('director_binary', 'cast_binary', 'rating_binary'):
        total += spatial.distance.cosine(row_a[feature], row_b[feature])
    return total
# Sanity check: distance between the 1st and 100th rows, plus the raw rows.
Similarity(0, 99)
print(data_knn.iloc[0])
print(data_knn.iloc[99])
# Bring the numeric rating back in under the name 'rating_num'.
rating_view_to_merge = netflix_data_filter.loc[:, ['netflixid', 'rating']]
rating_view_to_merge.rename(columns={'rating': 'rating_num'}, inplace=True)
merge_for_recommendation = pd.merge(data_knn, rating_view_to_merge, on='netflixid', how='inner')
#print(merge_for_recommendation.shape)
merge_for_recommendation.head(2)
# Replace netflixid with a dense 0..n-1 id that the KNN loop can index by.
merge_for_recommendation['new_id'] = range(0, merge_for_recommendation.shape[0])
merge_for_recommendation = merge_for_recommendation.drop(columns=['netflixid'])
merge_for_recommendation.head(15)
# Peek at the first merged record.
for ind, merge_for_recommend in merge_for_recommendation.iterrows():
    print(merge_for_recommend)
    break
merge_for_recommendation = merge_for_recommendation.dropna()
merge_for_recommendation.shape
def getNeighbors(standard_movie_to_comp, K):
    """Return the K (new_id, distance) pairs closest to the given title row."""
    base_id = standard_movie_to_comp['new_id'].values[0]
    distances = []
    for _, candidate in merge_for_recommendation.iterrows():
        cand_id = candidate['new_id']
        if cand_id != base_id:  # skip the queried title itself
            distances.append((cand_id, Similarity(base_id, cand_id)))
    # Closest first (stable sort on the distance component).
    distances.sort(key=lambda pair: pair[1])
    # Index-based slice keeps the original IndexError when fewer than K candidates exist.
    return [distances[i] for i in range(K)]
def predict_rating():
    """Predict a title's numeric rating as the mean rating of its K=5
    nearest neighbours (Similarity over the cast/director/rating binary
    vectors), then print it next to the actual rating. On any failure the
    user is re-prompted recursively."""
    try:
        name_movie = input('Enter a movie title: ')
        print('')
        # First row whose title contains the input, kept as a one-row DataFrame.
        new_movie = merge_for_recommendation[merge_for_recommendation['title'].str.contains(name_movie)].iloc[0].to_frame().T
        print('Selected Movie: ',new_movie.title.values[0])
        K = 5  # number of neighbours to average over
        Rating = 0
        neighbors = getNeighbors(new_movie, K)  # (new_id, distance) pairs, closest first
        for i in neighbors:
            # NOTE(review): positional column 7 is assumed to be 'rating_num' —
            # confirm against merge_for_recommendation's column layout.
            Rating = Rating+merge_for_recommendation.iloc[i[0]][7]
        print('\n')
        Rating = Rating/K  # mean neighbour rating = the prediction
        print('The predicted rating for %s is %f' %(new_movie['title'].values[0],Rating))
        print('The actual rating for %s is %f' %(new_movie['title'].values[0],new_movie['rating_num']))
        #print(f"The predicted rating for {new_movie['title'].values[0]} is {Rating}")
        #print(f"The actual rating for {new_movie['title'].values[0]} is {new_movie['rating_num']}")
    except Exception:
        # Title not found (or bad data): suggest known-good titles and retry.
        print("\nYour movie can not be processed/found for rating prediction. Please enter another movies like \n"
              "Hold the Dark\n"
              "Little Evil")
        predict_rating()
predict_rating()  # start the interactive prediction loop
#try the following:
#Hold the Dark
#Gaga: Five Foot Two
# Time Trap
#Supergirl
#Little Evil
# De-duplicate on (netflixid, title) and confirm nothing duplicated remains.
n_data_filter = n_data.drop_duplicates(subset=['netflixid', 'title'])
netflix_data_filter = netflix_dat.drop_duplicates(subset=['netflixid', 'title'])
print(netflix_data_filter.duplicated().any())
print(n_data_filter.duplicated().any())
copy_count_rated.head(2)
# Join the KNN features with the rating standard/number for classification.
rating_merge_classify = copy_count_rated.loc[:, ['netflixid', 'rating_standard', 'rating_num']]
merge_for_classification = pd.merge(data_knn, rating_merge_classify, on='netflixid', how='inner')
merge_for_classification.head(2)
#merge_for_classification.director.to_list()
# LabelEncoder maps each distinct string to an integer in [0, n_classes).
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
# Encode every string feature we may use; each fit_transform refits the
# encoder on that column alone, exactly as the original per-column calls did.
for source_col, encoded_col in [('rating', 'rating_encoded'),
                                ('cast', 'casting_encoded'),
                                ('director', 'director_encoded'),
                                ('rating_standard', 'rating_standard_encoded')]:
    merge_for_classification[encoded_col] = le.fit_transform(merge_for_classification[source_col])
merge_for_classification
#.rating_standard_encoded.to_list()
# Feature matrix (cast + director encodings) and target (rating standard).
X = merge_for_classification[['casting_encoded', 'director_encoded']].values
y = merge_for_classification['rating_standard_encoded'].to_numpy()
from sklearn.model_selection import train_test_split
from sklearn import metrics
def classify_rating(c, d):
    """Fit a 5-NN classifier on (cast, director) encodings and print whether
    the pair (c, d) is predicted High- or Low-rated."""
    X = merge_for_classification[['casting_encoded', 'director_encoded']].values
    y = merge_for_classification['rating_standard_encoded'].values
    # 80% train / 20% test split (the held-out set is not evaluated here).
    X_train_first, X_test_first, y_train_first, y_test_first = train_test_split(
        X, y, train_size=0.8, test_size=0.2, random_state=4)
    from sklearn.neighbors import KNeighborsClassifier
    # Model based on the five closest neighbours.
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train_first, y_train_first)
    predicted = knn.predict([[c, d]])
    print(" ")
    # Recover the human-readable names behind the encoded values.
    director_name = merge_for_classification[merge_for_classification['director_encoded'] == d].iloc[0][2]
    movie_title = merge_for_classification[merge_for_classification['casting_encoded'] == c].iloc[0][1]
    # Encoded class 1 corresponds to 'Low-rated'.
    verdict = "Low-rated" if predicted == [1] else "High-rated"
    print(f"If '{director_name}' directs '{movie_title}', the rating standard will be {verdict}.")
def get_cast_director_info():
    """Prompt for a title and a director; return their encoded values
    (the title row's casting_encoded and the director row's director_encoded)."""
    title = input("Enter a title name: ")
    print(" ")
    # First row whose title contains the input, as a one-row frame.
    new_title = merge_for_classification[merge_for_classification['title'].str.contains(title)].iloc[0].to_frame().T
    cast_encoded = new_title.casting_encoded.values[0]
    print(f"The cast name for this movie are: \n------{new_title.cast.values[0]}")
    print(" ")
    print(f"This movie/show is {new_title.rating_standard.values[0]}.")
    director = input("Enter a director name that you would like to see this movie directed: ")
    # First row whose director field contains the input.
    n_movie = merge_for_classification[merge_for_classification['director'].str.contains(director)].iloc[0].to_frame().T
    direct_encoded = n_movie.director_encoded.values[0]
    return cast_encoded, direct_encoded
# Example cast: Taissa Farmiga, Ben Rosenfield, Lindsay Burdge, Joshua Leonard,
# Jennifer Lafleur, Peter Vack, Dana Wheeler-Nicholson, Jason Newman, Molly McMichael
# Example director: Jay Karas
##### note: try title "6 Years" — it is Low-rated, and director Jay Karas flips it High-rated.
# Collect user input, then classify the (cast, director) pair.
cast, director = get_cast_director_info()
classify_rating(cast, director)